{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import lightgbm as lgb\n",
    "from sklearn.model_selection import StratifiedKFold, KFold\n",
    "from sklearn.metrics import roc_auc_score\n",
    "import warnings\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "import pandas as pd\n",
    "import gc\n",
    "import os\n",
    "\n",
    "warnings.simplefilter('ignore')\n",
    "%matplotlib inline\n",
    "\n",
    "pd.set_option('max_columns', None)\n",
    "pd.set_option('max_rows', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "seed = 2020\n",
    "v = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install lightgbm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_feature = pd.read_pickle('data/feature{}.pkl'.format(v))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_feature.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for f in df_feature.select_dtypes('object').columns:\n",
    "    if f not in ['user']:\n",
    "        lbl = LabelEncoder()\n",
    "        df_feature[f] = lbl.fit_transform(df_feature[f].astype(str))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = df_feature[df_feature.label.notna()].copy()\n",
    "df_test = df_feature[df_feature.label.isna()].copy()\n",
    "\n",
    "df_train.shape, df_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "ycol = 'label'\n",
    "feature_names = list(\n",
    "    filter(lambda x: x not in [ycol, 'user'], df_train.columns))\n",
    "\n",
    "model = lgb.LGBMClassifier(objective='binary',\n",
    "                           boosting_type='gbdt',\n",
    "                           num_leaves=32,\n",
    "                           max_depth=6,\n",
    "                           learning_rate=0.01,\n",
    "                           n_estimators=10000,\n",
    "                           subsample=0.8,\n",
    "                           feature_fraction=0.6,\n",
    "                           reg_alpha=10,\n",
    "                           reg_lambda=12,\n",
    "                           random_state=seed,\n",
    "                           is_unbalance=True,\n",
    "                           metric='auc')\n",
    "\n",
    "df_oof = df_train[['user', ycol]].copy()\n",
    "df_oof['prob'] = 0\n",
    "prediction = df_test[['user']]\n",
    "prediction['prob'] = 0\n",
    "df_importance_list = []\n",
    "\n",
    "kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)\n",
    "for fold_id, (trn_idx, val_idx) in enumerate(\n",
    "        kfold.split(df_train[feature_names], df_train[ycol])):\n",
    "    X_train = df_train.iloc[trn_idx][feature_names]\n",
    "    Y_train = df_train.iloc[trn_idx][ycol]\n",
    "\n",
    "    X_val = df_train.iloc[val_idx][feature_names]\n",
    "    Y_val = df_train.iloc[val_idx][ycol]\n",
    "\n",
    "    print('\\nFold_{} Training ================================\\n'.format(\n",
    "        fold_id + 1))\n",
    "\n",
    "    lgb_model = model.fit(X_train,\n",
    "                          Y_train,\n",
    "                          eval_names=['train', 'valid'],\n",
    "                          eval_set=[(X_train, Y_train), (X_val, Y_val)],\n",
    "                          verbose=100,\n",
    "                          early_stopping_rounds=50)\n",
    "\n",
    "    pred_val = lgb_model.predict_proba(\n",
    "        X_val, num_iteration=lgb_model.best_iteration_)[:, 1]\n",
    "    df_oof.loc[val_idx, 'prob'] = pred_val\n",
    "\n",
    "    pred_test = lgb_model.predict_proba(\n",
    "        df_test[feature_names], num_iteration=lgb_model.best_iteration_)[:, 1]\n",
    "    prediction['prob'] += pred_test / kfold.n_splits\n",
    "\n",
    "    df_importance = pd.DataFrame({\n",
    "        'column': feature_names,\n",
    "        'importance': lgb_model.feature_importances_,\n",
    "    })\n",
    "    df_importance_list.append(df_importance)\n",
    "\n",
    "    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val\n",
    "    gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df_importance = pd.concat(df_importance_list)\n",
    "df_importance = df_importance.groupby([\n",
    "    'column'\n",
    "])['importance'].agg('mean').sort_values(ascending=False).reset_index()\n",
    "df_importance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "auc = roc_auc_score(df_oof[ycol], df_oof['prob'])\n",
    "print('auc:', auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 0.737320648758525\n",
    "# 0.7372858742331708"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.makedirs('sub', exist_ok=True)\n",
    "prediction.to_csv('sub/yizhifu_{}.csv'.format(auc), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.makedirs('prob', exist_ok=True)\n",
    "\n",
    "prediction.to_csv('prob/sub_lgb{}.csv'.format(v), index=False)\n",
    "df_oof[['user', 'prob', ycol]].to_csv('prob/oof_lgb{}.csv'.format(v), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_oof.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prediction.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
